
#include "sparc_matrix.h"

	.register %g2, #scratch
	.register %g3, #scratch

	.text

#ifdef __arch64__
#define STACK_VAR_OFF	(2047 + (8 * 16))
#else
#define STACK_VAR_OFF	(4 * 16)
#endif

	/* Newton-Raphson approximation turns out to be slower
	 * (and less accurate) than direct fsqrts/fdivs.
	 */
#define ONE_DOT_ZERO	0x3f800000

	.globl	_mesa_sparc_transform_normalize_normals
_mesa_sparc_transform_normalize_normals:
	/* o0=mat o1=scale o2=in o3=lengths o4=dest */

	sethi	%hi(ONE_DOT_ZERO), %g2
	sub	%sp, 16, %sp
	st	%g2, [%sp + STACK_VAR_OFF+0x0]
	st	%o1, [%sp + STACK_VAR_OFF+0x4]
	ld	[%sp + STACK_VAR_OFF+0x0], %f12	! f12 = 1.0f
	ld	[%sp + STACK_VAR_OFF+0x4], %f15	! f15 = scale
	add	%sp, 16, %sp

	add     %o0, MATRIX_INV, %o0		! o0 = mat->inv
	LDPTR	[%o2 + V4F_START], %o5		! o5 = 'from' in->start
	ld	[%o2 + V4F_COUNT], %g1		! g1 = in->count
	ld	[%o2 + V4F_STRIDE], %g2		! g2 = in->stride
	LDPTR	[%o4 + V4F_START], %g3		! g3 = 'out' dest->start

	LDMATRIX_0_1_2_4_5_6_8_9_10(%o0)

	/* dest->count = in->count */
	st	%g1, [%o4 + V4F_COUNT]

	cmp	%g1, 1
	bl	7f
	 cmp	%o3, 0
	bne	4f
	 clr	%o4				! 'i' for STRIDE_LOOP

1:	/* LENGTHS == NULL */
	ld	[%o5 + 0x00], %f0		! ux = from[0]
	ld	[%o5 + 0x04], %f1		! uy = from[1]
	ld	[%o5 + 0x08], %f2		! uz = from[2]
	add	%o5, %g2, %o5			! STRIDE_F(from, stride)
	add	%o4, 1, %o4			! i++

	/* tx (f3) = (ux * m0) + (uy * m1) + (uz * m2)
	 * ty (f5) = (ux * m4) + (uy * m5) + (uz * m6)
	 * tz (f7) = (ux * m8) + (uy * m9) + (uz * m10)
	 */
	fmuls	%f0, M0, %f3			! FGM	Group
	fmuls	%f1, M1, %f4			! FGM	Group
	fmuls	%f0, M4, %f5			! FGM	Group
	fmuls	%f1, M5, %f6			! FGM	Group
	fmuls	%f0, M8, %f7			! FGM	Group	f3 available
	fmuls	%f1, M9, %f8			! FGM	Group	f4 available
	fadds	%f3, %f4, %f3			! FGA
	fmuls	%f2, M2, %f10			! FGM	Group	f5 available
	fmuls	%f2, M6, %f0			! FGM	Group	f6 available
	fadds	%f5, %f6, %f5			! FGA
	fmuls	%f2, M10, %f4			! FGM	Group	f7 available
	fadds	%f7, %f8, %f7			! FGA	Group	f8,f3 available
	fadds	%f3, %f10, %f3			! FGA	Group	f10 available
	fadds	%f5, %f0, %f5			! FGA	Group	stall f0,f5 available
	fadds	%f7, %f4, %f7			! FGA	Group	stall f4,f7 available

	/* f3=tx, f5=ty, f7=tz */

	/* len (f6) = (tx * tx) + (ty * ty) + (tz * tz) */
	fmuls	%f3, %f3, %f6			! FGM	Group	f3 available
	fmuls	%f5, %f5, %f8			! FGM	Group	f5 available
	fmuls	%f7, %f7, %f10			! FGM	Group	f7 available
	fadds	%f6, %f8, %f6			! FGA	Group	2cyc stall f6,f8 available
	fadds	%f6, %f10, %f6			! FGA	Group	4cyc stall f6,f10 available

	/* scale (f6) = 1.0 / sqrt(len) */
	fsqrts	%f6, %f6			! FDIV  20 cycles
	fdivs	%f12, %f6, %f6			! FDIV	14 cycles

	fmuls	%f3, %f6, %f3
	st	%f3, [%g3 + 0x00]		! out[i][0] = tx * scale
	fmuls	%f5, %f6, %f5
	st	%f5, [%g3 + 0x04]		! out[i][1] = ty * scale
	fmuls	%f7, %f6, %f7
	st	%f7, [%g3 + 0x08]		! out[i][2] = tz * scale

	cmp	%o4, %g1			! continue if (i < count)
	bl	1b
	 add	%g3, 0x10, %g3			! advance out vector pointer

	ba	7f
	 nop

4:	/* LENGTHS != NULL */
	fmuls	M0, %f15, M0
	fmuls	M1, %f15, M1
	fmuls	M2, %f15, M2
	fmuls	M4, %f15, M4
	fmuls	M5, %f15, M5
	fmuls	M6, %f15, M6
	fmuls	M8, %f15, M8
	fmuls	M9, %f15, M9
	fmuls	M10, %f15, M10

5:
	ld	[%o5 + 0x00], %f0		! ux = from[0]
	ld	[%o5 + 0x04], %f1		! uy = from[1]
	ld	[%o5 + 0x08], %f2		! uz = from[2]
	add	%o5, %g2, %o5			! STRIDE_F(from, stride)
	add	%o4, 1, %o4			! i++

	/* tx (f3) = (ux * m0) + (uy * m1) + (uz * m2)
	 * ty (f5) = (ux * m4) + (uy * m5) + (uz * m6)
	 * tz (f7) = (ux * m8) + (uy * m9) + (uz * m10)
	 */
	fmuls	%f0, M0, %f3			! FGM	Group
	fmuls	%f1, M1, %f4			! FGM	Group
	fmuls	%f0, M4, %f5			! FGM	Group
	fmuls	%f1, M5, %f6			! FGM	Group
	fmuls	%f0, M8, %f7			! FGM	Group	f3 available
	fmuls	%f1, M9, %f8			! FGM	Group	f4 available
	fadds	%f3, %f4, %f3			! FGA
	fmuls	%f2, M2, %f10			! FGM	Group	f5 available
	fmuls	%f2, M6, %f0			! FGM	Group	f6 available
	fadds	%f5, %f6, %f5			! FGA
	fmuls	%f2, M10, %f4			! FGM	Group	f7 available
	fadds	%f7, %f8, %f7			! FGA	Group	f8,f3 available
	fadds	%f3, %f10, %f3			! FGA	Group	f10 available
	ld	[%o3], %f13			! LSU
	fadds	%f5, %f0, %f5			! FGA	Group	stall f0,f5 available
	add	%o3, 4, %o3			! IEU0
	fadds	%f7, %f4, %f7			! FGA	Group	stall f4,f7 available

	/* f3=tx, f5=ty, f7=tz, f13=lengths[i] */

	fmuls	%f3, %f13, %f3
	st	%f3, [%g3 + 0x00]		! out[i][0] = tx * len
	fmuls	%f5, %f13, %f5
	st	%f5, [%g3 + 0x04]		! out[i][1] = ty * len
	fmuls	%f7, %f13, %f7
	st	%f7, [%g3 + 0x08]		! out[i][2] = tz * len

	cmp	%o4, %g1			! continue if (i < count)
	bl	5b
	 add	%g3, 0x10, %g3			! advance out vector pointer

7:	retl
	 nop
	
	.globl	_mesa_sparc_transform_normalize_normals_no_rot
_mesa_sparc_transform_normalize_normals_no_rot:
	/* o0=mat o1=scale o2=in o3=lengths o4=dest */

	sethi	%hi(ONE_DOT_ZERO), %g2
	sub	%sp, 16, %sp
	st	%g2, [%sp + STACK_VAR_OFF+0x0]
	st	%o1, [%sp + STACK_VAR_OFF+0x4]
	ld	[%sp + STACK_VAR_OFF+0x0], %f12	! f12 = 1.0f
	ld	[%sp + STACK_VAR_OFF+0x4], %f15	! f15 = scale
	add	%sp, 16, %sp

	add     %o0, MATRIX_INV, %o0		! o0 = mat->inv
	LDPTR	[%o2 + V4F_START], %o5		! o5 = 'from' in->start
	ld	[%o2 + V4F_COUNT], %g1		! g1 = in->count
	ld	[%o2 + V4F_STRIDE], %g2		! g2 = in->stride
	LDPTR	[%o4 + V4F_START], %g3		! g3 = 'out' dest->start

	LDMATRIX_0_5_10(%o0)

	/* dest->count = in->count */
	st	%g1, [%o4 + V4F_COUNT]

	cmp	%g1, 1
	bl	7f
	 cmp	%o3, 0
	bne	4f
	 clr	%o4				! 'i' for STRIDE_LOOP

1:	/* LENGTHS == NULL */
	ld	[%o5 + 0x00], %f0		! ux = from[0]
	ld	[%o5 + 0x04], %f1		! uy = from[1]
	ld	[%o5 + 0x08], %f2		! uz = from[2]
	add	%o5, %g2, %o5			! STRIDE_F(from, stride)
	add	%o4, 1, %o4			! i++

	/* tx (f3) = (ux * m0)
	 * ty (f5) = (uy * m5)
	 * tz (f7) = (uz * m10)
	 */
	fmuls	%f0, M0, %f3			! FGM	Group
	fmuls	%f1, M5, %f5			! FGM	Group
	fmuls	%f2, M10, %f7			! FGM	Group

	/* f3=tx, f5=ty, f7=tz */

	/* len (f6) = (tx * tx) + (ty * ty) + (tz * tz) */
	fmuls	%f3, %f3, %f6			! FGM	Group	stall, f3 available
	fmuls	%f5, %f5, %f8			! FGM	Group	f5 available
	fmuls	%f7, %f7, %f10			! FGM	Group	f7 available
	fadds	%f6, %f8, %f6			! FGA	Group	2cyc stall f6,f8 available
	fadds	%f6, %f10, %f6			! FGA	Group	4cyc stall f6,f10 available

	/* scale (f6) = 1.0 / sqrt(len) */
	fsqrts	%f6, %f6			! FDIV  20 cycles
	fdivs	%f12, %f6, %f6			! FDIV	14 cycles

	fmuls	%f3, %f6, %f3
	st	%f3, [%g3 + 0x00]		! out[i][0] = tx * scale
	fmuls	%f5, %f6, %f5
	st	%f5, [%g3 + 0x04]		! out[i][1] = ty * scale
	fmuls	%f7, %f6, %f7
	st	%f7, [%g3 + 0x08]		! out[i][2] = tz * scale

	cmp	%o4, %g1			! continue if (i < count)
	bl	1b
	 add	%g3, 0x10, %g3			! advance out vector pointer

	ba	7f
	 nop

4:	/* LENGTHS != NULL */
	fmuls	M0, %f15, M0
	fmuls	M5, %f15, M5
	fmuls	M10, %f15, M10

5:
	ld	[%o5 + 0x00], %f0		! ux = from[0]
	ld	[%o5 + 0x04], %f1		! uy = from[1]
	ld	[%o5 + 0x08], %f2		! uz = from[2]
	add	%o5, %g2, %o5			! STRIDE_F(from, stride)
	add	%o4, 1, %o4			! i++

	/* tx (f3) = (ux * m0)
	 * ty (f5) = (uy * m5)
	 * tz (f7) = (uz * m10)
	 */
	fmuls	%f0, M0, %f3			! FGM	Group
	ld	[%o3], %f13			! LSU
	fmuls	%f1, M5, %f5			! FGM	Group
	add	%o3, 4, %o3			! IEU0
	fmuls	%f2, M10, %f7			! FGM	Group

	/* f3=tx, f5=ty, f7=tz, f13=lengths[i] */

	fmuls	%f3, %f13, %f3
	st	%f3, [%g3 + 0x00]		! out[i][0] = tx * len
	fmuls	%f5, %f13, %f5
	st	%f5, [%g3 + 0x04]		! out[i][1] = ty * len
	fmuls	%f7, %f13, %f7
	st	%f7, [%g3 + 0x08]		! out[i][2] = tz * len

	cmp	%o4, %g1			! continue if (i < count)
	bl	5b
	 add	%g3, 0x10, %g3			! advance out vector pointer

7:	retl
	 nop

	.globl	_mesa_sparc_transform_rescale_normals_no_rot
_mesa_sparc_transform_rescale_normals_no_rot:
	/* o0=mat o1=scale o2=in o3=lengths o4=dest */
	sub	%sp, 16, %sp
	st	%o1, [%sp + STACK_VAR_OFF+0x0]
	ld	[%sp + STACK_VAR_OFF+0x0], %f15	! f15 = scale
	add	%sp, 16, %sp

	add     %o0, MATRIX_INV, %o0		! o0 = mat->inv
	LDPTR	[%o2 + V4F_START], %o5		! o5 = 'from' in->start
	ld	[%o2 + V4F_COUNT], %g1		! g1 = in->count
	ld	[%o2 + V4F_STRIDE], %g2		! g2 = in->stride
	LDPTR	[%o4 + V4F_START], %g3		! g3 = 'out' dest->start

	LDMATRIX_0_5_10(%o0)

	/* dest->count = in->count */
	st	%g1, [%o4 + V4F_COUNT]

	cmp	%g1, 1
	bl	7f
	 clr	%o4				! 'i' for STRIDE_LOOP

	fmuls	M0, %f15, M0
	fmuls	M5, %f15, M5
	fmuls	M10, %f15, M10

1:	ld	[%o5 + 0x00], %f0		! ux = from[0]
	ld	[%o5 + 0x04], %f1		! uy = from[1]
	ld	[%o5 + 0x08], %f2		! uz = from[2]
	add	%o5, %g2, %o5			! STRIDE_F(from, stride)
	add	%o4, 1, %o4			! i++

	/* tx (f3) = (ux * m0)
	 * ty (f5) = (uy * m5)
	 * tz (f7) = (uz * m10)
	 */
	fmuls	%f0, M0, %f3			! FGM	Group
	st	%f3, [%g3 + 0x00]		! LSU
	fmuls	%f1, M5, %f5			! FGM	Group
	st	%f5, [%g3 + 0x04]		! LSU
	fmuls	%f2, M10, %f7			! FGM	Group
	st	%f7, [%g3 + 0x08]		! LSU

	cmp	%o4, %g1			! continue if (i < count)
	bl	1b
	 add	%g3, 0x10, %g3			! advance out vector pointer

7:	retl
	 nop

	.globl	_mesa_sparc_transform_rescale_normals
_mesa_sparc_transform_rescale_normals:
	/* o0=mat o1=scale o2=in o3=lengths o4=dest */
	sub	%sp, 16, %sp
	st	%o1, [%sp + STACK_VAR_OFF+0x0]
	ld	[%sp + STACK_VAR_OFF+0x0], %f15	! f15 = scale
	add	%sp, 16, %sp

	add     %o0, MATRIX_INV, %o0		! o0 = mat->inv
	LDPTR	[%o2 + V4F_START], %o5		! o5 = 'from' in->start
	ld	[%o2 + V4F_COUNT], %g1		! g1 = in->count
	ld	[%o2 + V4F_STRIDE], %g2		! g2 = in->stride
	LDPTR	[%o4 + V4F_START], %g3		! g3 = 'out' dest->start

	LDMATRIX_0_1_2_4_5_6_8_9_10(%o0)

	/* dest->count = in->count */
	st	%g1, [%o4 + V4F_COUNT]

	cmp	%g1, 1
	bl	7f
	 clr	%o4				! 'i' for STRIDE_LOOP

	fmuls	M0, %f15, M0
	fmuls	M1, %f15, M1
	fmuls	M2, %f15, M2
	fmuls	M4, %f15, M4
	fmuls	M5, %f15, M5
	fmuls	M6, %f15, M6
	fmuls	M8, %f15, M8
	fmuls	M9, %f15, M9
	fmuls	M10, %f15, M10

1:	ld	[%o5 + 0x00], %f0		! ux = from[0]
	ld	[%o5 + 0x04], %f1		! uy = from[1]
	ld	[%o5 + 0x08], %f2		! uz = from[2]
	add	%o5, %g2, %o5			! STRIDE_F(from, stride)
	add	%o4, 1, %o4			! i++

	fmuls	%f0, M0, %f3			! FGM	Group
	fmuls	%f1, M1, %f4			! FGM	Group
	fmuls	%f0, M4, %f5			! FGM	Group
	fmuls	%f1, M5, %f6			! FGM	Group
	fmuls	%f0, M8, %f7			! FGM	Group	f3 available
	fmuls	%f1, M9, %f8			! FGM	Group	f4 available
	fadds	%f3, %f4, %f3			! FGA
	fmuls	%f2, M2, %f10			! FGM	Group	f5 available
	fmuls	%f2, M6, %f0			! FGM	Group	f6 available
	fadds	%f5, %f6, %f5			! FGA
	fmuls	%f2, M10, %f4			! FGM	Group	f7 available
	fadds	%f7, %f8, %f7			! FGA	Group	f8,f3 available
	fadds	%f3, %f10, %f3			! FGA	Group	f10 available
	st	%f3, [%g3 + 0x00]		! LSU
	fadds	%f5, %f0, %f5			! FGA	Group	stall f0,f5 available
	st	%f5, [%g3 + 0x04]		! LSU
	fadds	%f7, %f4, %f7			! FGA	Group	stall f4,f7 available
	st	%f7, [%g3 + 0x08]		! LSU

	cmp	%o4, %g1			! continue if (i < count)
	bl	1b
	 add	%g3, 0x10, %g3			! advance out vector pointer

7:	retl
	 nop

	.globl	_mesa_sparc_transform_normals_no_rot
_mesa_sparc_transform_normals_no_rot:
	/* o0=mat o1=scale o2=in o3=lengths o4=dest */
	add     %o0, MATRIX_INV, %o0		! o0 = mat->inv
	LDPTR	[%o2 + V4F_START], %o5		! o5 = 'from' in->start
	ld	[%o2 + V4F_COUNT], %g1		! g1 = in->count
	ld	[%o2 + V4F_STRIDE], %g2		! g2 = in->stride
	LDPTR	[%o4 + V4F_START], %g3		! g3 = 'out' dest->start

	LDMATRIX_0_5_10(%o0)

	/* dest->count = in->count */
	st	%g1, [%o4 + V4F_COUNT]

	cmp	%g1, 1
	bl	7f
	 clr	%o4				! 'i' for STRIDE_LOOP

1:	ld	[%o5 + 0x00], %f0		! ux = from[0]
	ld	[%o5 + 0x04], %f1		! uy = from[1]
	ld	[%o5 + 0x08], %f2		! uz = from[2]
	add	%o5, %g2, %o5			! STRIDE_F(from, stride)
	add	%o4, 1, %o4			! i++

	/* tx (f3) = (ux * m0)
	 * ty (f5) = (uy * m5)
	 * tz (f7) = (uz * m10)
	 */
	fmuls	%f0, M0, %f3			! FGM	Group
	st	%f3, [%g3 + 0x00]		! LSU
	fmuls	%f1, M5, %f5			! FGM	Group
	st	%f5, [%g3 + 0x04]		! LSU
	fmuls	%f2, M10, %f7			! FGM	Group
	st	%f7, [%g3 + 0x08]		! LSU

	cmp	%o4, %g1			! continue if (i < count)
	bl	1b
	 add	%g3, 0x10, %g3			! advance out vector pointer

7:	retl
	 nop

	.globl	_mesa_sparc_transform_normals
_mesa_sparc_transform_normals:
	/* o0=mat o1=scale o2=in o3=lengths o4=dest */
	add     %o0, MATRIX_INV, %o0		! o0 = mat->inv
	LDPTR	[%o2 + V4F_START], %o5		! o5 = 'from' in->start
	ld	[%o2 + V4F_COUNT], %g1		! g1 = in->count
	ld	[%o2 + V4F_STRIDE], %g2		! g2 = in->stride
	LDPTR	[%o4 + V4F_START], %g3		! g3 = 'out' dest->start

	LDMATRIX_0_1_2_4_5_6_8_9_10(%o0)

	/* dest->count = in->count */
	st	%g1, [%o4 + V4F_COUNT]

	cmp	%g1, 1
	bl	7f
	 clr	%o4				! 'i' for STRIDE_LOOP

1:	ld	[%o5 + 0x00], %f0		! ux = from[0]
	ld	[%o5 + 0x04], %f1		! uy = from[1]
	ld	[%o5 + 0x08], %f2		! uz = from[2]
	add	%o5, %g2, %o5			! STRIDE_F(from, stride)
	add	%o4, 1, %o4			! i++

	fmuls	%f0, M0, %f3			! FGM	Group
	fmuls	%f1, M1, %f4			! FGM	Group
	fmuls	%f0, M4, %f5			! FGM	Group
	fmuls	%f1, M5, %f6			! FGM	Group
	fmuls	%f0, M8, %f7			! FGM	Group	f3 available
	fmuls	%f1, M9, %f8			! FGM	Group	f4 available
	fadds	%f3, %f4, %f3			! FGA
	fmuls	%f2, M2, %f10			! FGM	Group	f5 available
	fmuls	%f2, M6, %f0			! FGM	Group	f6 available
	fadds	%f5, %f6, %f5			! FGA
	fmuls	%f2, M10, %f4			! FGM	Group	f7 available
	fadds	%f7, %f8, %f7			! FGA	Group	f8,f3 available
	fadds	%f3, %f10, %f3			! FGA	Group	f10 available
	st	%f3, [%g3 + 0x00]		! LSU
	fadds	%f5, %f0, %f5			! FGA	Group	stall f0,f5 available
	st	%f5, [%g3 + 0x04]		! LSU
	fadds	%f7, %f4, %f7			! FGA	Group	stall f4,f7 available
	st	%f7, [%g3 + 0x08]		! LSU

	cmp	%o4, %g1			! continue if (i < count)
	bl	1b
	 add	%g3, 0x10, %g3			! advance out vector pointer

7:	retl
	 nop

	.globl	_mesa_sparc_normalize_normals
_mesa_sparc_normalize_normals:
	/* o0=mat o1=scale o2=in o3=lengths o4=dest */

	sethi	%hi(ONE_DOT_ZERO), %g2
	sub	%sp, 16, %sp
	st	%g2, [%sp + STACK_VAR_OFF+0x0]
	ld	[%sp + STACK_VAR_OFF+0x0], %f12	! f12 = 1.0f
	add	%sp, 16, %sp

	LDPTR	[%o2 + V4F_START], %o5		! o5 = 'from' in->start
	ld	[%o2 + V4F_COUNT], %g1		! g1 = in->count
	ld	[%o2 + V4F_STRIDE], %g2		! g2 = in->stride
	LDPTR	[%o4 + V4F_START], %g3		! g3 = 'out' dest->start

	/* dest->count = in->count */
	st	%g1, [%o4 + V4F_COUNT]

	cmp	%g1, 1
	bl	7f
	 cmp	%o3, 0
	bne	4f
	 clr	%o4				! 'i' for STRIDE_LOOP

1:	/* LENGTHS == NULL */
	ld	[%o5 + 0x00], %f3		! ux = from[0]
	ld	[%o5 + 0x04], %f5		! uy = from[1]
	ld	[%o5 + 0x08], %f7		! uz = from[2]
	add	%o5, %g2, %o5			! STRIDE_F(from, stride)
	add	%o4, 1, %o4			! i++

	/* f3=tx, f5=ty, f7=tz */

	/* len (f6) = (tx * tx) + (ty * ty) + (tz * tz) */
	fmuls	%f3, %f3, %f6			! FGM	Group	f3 available
	fmuls	%f5, %f5, %f8			! FGM	Group	f5 available
	fmuls	%f7, %f7, %f10			! FGM	Group	f7 available
	fadds	%f6, %f8, %f6			! FGA	Group	2cyc stall f6,f8 available
	fadds	%f6, %f10, %f6			! FGA	Group	4cyc stall f6,f10 available

	/* scale (f6) = 1.0 / sqrt(len) */
	fsqrts	%f6, %f6			! FDIV  20 cycles
	fdivs	%f12, %f6, %f6			! FDIV	14 cycles

	fmuls	%f3, %f6, %f3
	st	%f3, [%g3 + 0x00]		! out[i][0] = tx * scale
	fmuls	%f5, %f6, %f5
	st	%f5, [%g3 + 0x04]		! out[i][1] = ty * scale
	fmuls	%f7, %f6, %f7
	st	%f7, [%g3 + 0x08]		! out[i][2] = tz * scale

	cmp	%o4, %g1			! continue if (i < count)
	bl	1b
	 add	%g3, 0x10, %g3			! advance out vector pointer

	ba	7f
	 nop

4:	/* LENGTHS != NULL */

5:
	ld	[%o5 + 0x00], %f3		! ux = from[0]
	ld	[%o5 + 0x04], %f5		! uy = from[1]
	ld	[%o5 + 0x08], %f7		! uz = from[2]
	add	%o5, %g2, %o5			! STRIDE_F(from, stride)
	add	%o4, 1, %o4			! i++

	ld	[%o3], %f13			! LSU
	add	%o3, 4, %o3			! IEU0

	/* f3=tx, f5=ty, f7=tz, f13=lengths[i] */

	fmuls	%f3, %f13, %f3
	st	%f3, [%g3 + 0x00]		! out[i][0] = tx * len
	fmuls	%f5, %f13, %f5
	st	%f5, [%g3 + 0x04]		! out[i][1] = ty * len
	fmuls	%f7, %f13, %f7
	st	%f7, [%g3 + 0x08]		! out[i][2] = tz * len

	cmp	%o4, %g1			! continue if (i < count)
	bl	5b
	 add	%g3, 0x10, %g3			! advance out vector pointer

7:	retl
	 nop

	.globl	_mesa_sparc_rescale_normals
_mesa_sparc_rescale_normals:
	/* o0=mat o1=scale o2=in o3=lengths o4=dest */

	sethi	%hi(ONE_DOT_ZERO), %g2
	sub	%sp, 16, %sp
	st	%o1, [%sp + STACK_VAR_OFF+0x0]
	ld	[%sp + STACK_VAR_OFF+0x0], %f15	! f15 = scale
	add	%sp, 16, %sp

	LDPTR	[%o2 + V4F_START], %o5		! o5 = 'from' in->start
	ld	[%o2 + V4F_COUNT], %g1		! g1 = in->count
	ld	[%o2 + V4F_STRIDE], %g2		! g2 = in->stride
	LDPTR	[%o4 + V4F_START], %g3		! g3 = 'out' dest->start

	/* dest->count = in->count */
	st	%g1, [%o4 + V4F_COUNT]

	cmp	%g1, 1
	bl	7f
	 clr	%o4				! 'i' for STRIDE_LOOP

1:
	ld	[%o5 + 0x00], %f3		! ux = from[0]
	ld	[%o5 + 0x04], %f5		! uy = from[1]
	ld	[%o5 + 0x08], %f7		! uz = from[2]
	add	%o5, %g2, %o5			! STRIDE_F(from, stride)
	add	%o4, 1, %o4			! i++

	/* f3=tx, f5=ty, f7=tz */

	fmuls	%f3, %f15, %f3
	st	%f3, [%g3 + 0x00]		! out[i][0] = tx * scale
	fmuls	%f5, %f15, %f5
	st	%f5, [%g3 + 0x04]		! out[i][1] = ty * scale
	fmuls	%f7, %f15, %f7
	st	%f7, [%g3 + 0x08]		! out[i][2] = tz * scale

	cmp	%o4, %g1			! continue if (i < count)
	bl	1b
	 add	%g3, 0x10, %g3			! advance out vector pointer

7:	retl
	 nop
